We then randomly choose small 5x5 grids at random $X$ and $Y$ positions and use each grid's mean weighted synapse density to predict whether its $Z$ layer belongs to the high- or low-density group.
$N$ is the number of synapses (observations)
Classification methods: k-nearest neighbors (k = 3), linear SVM, random forest, linear discriminant analysis, and quadratic discriminant analysis.
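Before walking through the full pipeline, the cell below sketches the shape of this one-dimensional classification problem. It is a minimal, self-contained toy example: the density values, labels, and query points are hypothetical and are not drawn from the data.
In [ ]:
# Toy sketch of the classification task set up in the cells below (hypothetical numbers):
# each observation is one grid's mean weighted synapse density, and the label marks
# whether its Z layer falls in the low-density (0) or high-density (1) group.
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
toy_density = np.array([150., 160., 145., 230., 240., 235.]).reshape(-1, 1)
toy_label = np.array([0, 0, 0, 1, 1, 1])
clf = KNeighborsClassifier(n_neighbors=3).fit(toy_density, toy_label)
print(clf.predict([[155.], [238.]]))  # expected output: [0 1]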
In [23]:
# Import packages
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as ss
from scipy.stats import norm
import seaborn as sns
from sklearn import cross_validation
from sklearn.cross_validation import LeaveOneOut
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
np.random.seed(12345678) # for reproducibility, set random seed
# Read in data
df = pd.read_csv('../output.csv')
nvox = 64*64*48 # assume number of voxels per bin
df['weighted'] = df['synapses']/df['unmasked']*nvox
xvals = df['cx'].unique()
yvals = df['cy'].unique()
zvals = df['cz'].unique()
# print xvals
# print yvals
# print zvals
# plt.hist(df['cx'], weights=df['unmasked']/(nvox*len(yvals)*len(zvals)),
# bins=len(xvals), edgecolor='none')
# plt.xlabel('X coordinate')
# plt.ylabel('Fraction unmasked voxels')
# plt.title('Original data')
# plt.show()
# plt.hist(df['cy'], weights=df['unmasked']/(nvox*len(xvals)*len(zvals)),
# bins=len(yvals), edgecolor='none')
# plt.xlabel('Y coordinate')
# plt.ylabel('Fraction unmasked voxels')
# plt.title('Original data')
# plt.show()
In [24]:
# Get rid of the blank edges, Z-layer by Z-layer.
df['edge'] = 0
for z in zvals:
    this_z = df[df['cz']==z]
    # X direction
    xhist, bin_edges = np.histogram(this_z['cx'], weights = this_z['unmasked']/(nvox*len(yvals)), bins=len(xvals))
    left = np.argmax(xhist>0.5)
    right = len(xvals)-np.argmax(xhist[::-1]>0.5)
    df.loc[(df['cz']==z) & ((df['cx']<xvals[left]) | (df['cx']>=xvals[right])),'edge'] = 1
    # Y direction
    yhist, bin_edges = np.histogram(this_z['cy'], weights = this_z['unmasked']/(nvox*len(xvals)), bins=len(yvals))
    top = np.argmax(yhist>0.5)
    bottom = len(yvals)-np.argmax(yhist[::-1]>0.5)
    df.loc[(df['cz']==z) & ((df['cy']<yvals[top]) | (df['cy']>=yvals[bottom])),'edge'] = 1
print 'We labeled',np.sum(df['edge']==1),'bins as edges.'
In [25]:
# Copy new dataset without edges
df2 = df[df['edge']==0].copy()
df2 = df2.drop('edge', axis=1)
print df2.head()
In [26]:
import pickle
fs = 18
lfs = 14
# Saving random state so that we can generate the same grids later
rnd_state = np.random.get_state()
with open('random_state_update.pickle', 'w') as f:
    pickle.dump(rnd_state, f)
# Generating dataset of 5x5 grids
grids = []
grid_x = []
grid_y = []
min_x = np.max([df2[df2['cz']==z]['cx'].min() for z in zvals])
max_x = np.min([df2[df2['cz']==z]['cx'].max() for z in zvals])
min_y = np.max([df2[df2['cz']==z]['cy'].min() for z in zvals])
max_y = np.min([df2[df2['cz']==z]['cy'].max() for z in zvals])
left = np.argmax(xvals==min_x)+2
right = np.argmax(xvals==max_x)-2
bottom = np.argmax(yvals==min_y)+2
top = np.argmax(yvals==max_y)-2
for i in range(100):
    xi = np.random.randint(left, right+1)
    yi = np.random.randint(bottom, top+1)
    for z in zvals:
        newgrid = df2[
            (df2['cx']>=xvals[xi-2]) & (df2['cx']<=xvals[xi+2]) &
            (df2['cy']>=yvals[yi-2]) & (df2['cy']<=yvals[yi+2]) &
            (df2['cz'] == z)
        ]
        grids.append(newgrid)
        grid_x.append(xvals[xi])
        grid_y.append(yvals[yi])
        # plot examples for one Z layer
        if (z==55 and i<1):
            exampleXY = pd.pivot_table(grids[i], index='cy', columns='cx', values='weighted', aggfunc=np.sum)
            plt.figure()
            sns.heatmap(exampleXY, cbar_kws={'label': 'Weighted number of synapses'});
            plt.title('Example X-Y grid', fontsize = fs);
            plt.xlabel('Cortical x-coordinate', fontsize = fs)
            plt.ylabel('Cortical y-coordinate', fontsize = fs)
            plt.tick_params(axis='both', which='major', labelsize=lfs)
            cax = plt.gcf().axes[-1]
            cax.tick_params(labelsize=lfs)
            plt.tight_layout()
            plt.savefig('./figs/AndrewFigs/exampleGrid.png', format='png', dpi=300)
            plt.show()
In [27]:
# Mean density by Z layer
grid_means = np.array([g['weighted'].mean() for g in grids])
grid_z = np.array([g['cz'].mean() for g in grids])
z_means = np.array([df2[df2['cz']==z]['weighted'].mean() for z in zvals])
plt.figure()
plt.boxplot([grid_means[grid_z==z] for z in zvals], vert=False, positions=zvals, widths=60)
Z_pts = plt.scatter(z_means, zvals, label='entire Z layer', s=60, color='r', marker='x', linewidths=2)
plt.ylim(0,1200)
plt.xlabel('Weighted Synapse Density', fontsize = fs)
plt.ylabel('Z layer', fontsize = fs)
plt.title('Distribution of 5x5 grid synapse\ndensities by layer',fontsize = fs)
plt.tick_params(axis='both', which='major', labelsize=lfs)
plt.legend()
plt.tight_layout()
plt.savefig('./figs/AndrewFigs/gridDistByLayer.png', format='png', dpi=300)
plt.show()
In [28]:
# Grid mean density vs. the mean density of its Z layer
grid_z_mean = np.array([df2[df2['cz']==g['cz'].mean()]['weighted'].mean() for g in grids])
plt.figure()
grid_pts = plt.scatter(grid_means, grid_z_mean, label='5x5 grid')
plt.title('Grid density vs. Z-layer mean density')
plt.xlabel('Grid Synapse Density')
plt.ylabel('Z-layer Mean Synapse Density')
plt.show()
In [29]:
# Split into high density vs. low density layers
from sklearn.cluster import KMeans
est = KMeans(n_clusters=2)
est.fit(z_means.reshape(len(z_means),1))
labels = est.labels_
# Save grid data
with open('Z_labels_update.pickle', 'w') as f:
    pickle.dump([zvals, labels], f)
plt.figure()
pts0 = plt.scatter(z_means[labels==0], zvals[labels==0], s=60, facecolor='steelblue')
pts1 = plt.scatter(z_means[labels==1], zvals[labels==1], s=60, facecolor='darkseagreen')
pts0.set_label('Group 0')
pts1.set_label('Group 1')
plt.xlabel('Synapse Density',fontsize = fs)
plt.ylabel('Z layer',fontsize = fs)
plt.title('K-Means clustering of mean-grid\ndensities across Z-layers',fontsize = fs)
plt.tick_params(axis='both', which='major', labelsize=lfs)
plt.legend()
plt.tight_layout()
plt.savefig('./figs/AndrewFigs/kmeansGrid.png', format='png', dpi=300)
plt.show()
In [30]:
# Plot distributions of our dataset
grid_labels = np.array([ labels[ np.argmax( zvals==g['cz'].mean() ) ] for g in grids ])
g0 = grid_means[np.where(grid_labels==0)]
g1 = grid_means[np.where(grid_labels==1)]
n0 = len(g0)
n1 = len(g1)
p0 = float(n0)/(n0+n1)
p1 = float(n1)/(n0+n1)
plt.figure()
ax = sns.distplot(g0, kde=False, fit=norm, label='Group 0', hist_kws={'edgecolor': 'none'})
sns.distplot(g1, kde=False, fit=norm, label='Group 1', hist_kws={'edgecolor': 'none'})
# weight fit by priors
fit0 = (ax.lines[0].get_data()[0], p0*ax.lines[0].get_data()[1])
ax.lines[0].set_data(fit0)
fit1 = (ax.lines[1].get_data()[0], p1*ax.lines[1].get_data()[1])
ax.lines[1].set_data(fit1)
# weight histograms by priors
scale = -1
for r in ax.patches:
    if r.get_label() == 'Group 0':
        scale = p0
    elif r.get_label() == 'Group 1':
        scale = p1
    if scale != -1:
        h = r.get_height()
        r.set_height(scale*h)
ax.set_ylim(0,0.009)
plt.xlabel('Weighted Synapse Density',fontsize = fs)
plt.ylabel('Probability', fontsize = fs)
plt.title('PDF of Weighted Synapse Density Across Both Groups', fontsize = fs)
plt.tick_params(axis='both', which='major', labelsize=lfs)
plt.legend()
plt.tight_layout()
plt.savefig('./figs/AndrewFigs/pdf.png', format='png', dpi=300)
plt.show()
In [31]:
# Run classification algorithms on dataset
names = ["Nearest Neighbors", "Linear SVM", "Random Forest",
"Linear Discriminant Analysis", "Quadratic Discriminant Analysis"]
classifiers = [
KNeighborsClassifier(3),
SVC(kernel="linear", C=0.001), # decreased C to improve computation time
RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
LinearDiscriminantAnalysis(),
QuadraticDiscriminantAnalysis()]
features = grid_means.reshape(-1,1)
y = grid_labels
accuracy=np.zeros((len(classifiers),2))
for idx, cla in enumerate(classifiers):
    loo = LeaveOneOut(len(features))
    scores = cross_validation.cross_val_score(cla, features, y, cv=loo)
    accuracy[idx,] = [scores.mean(), ss.sem(scores)]
    print("Accuracy of %s: %0.2f (+/- %0.2f)" % (names[idx], scores.mean(), ss.sem(scores)))
The five classifiers tested achieved accuracies between 75% and 80%, which is better than chance. However, these accuracies are only at or slightly above the maximum prior probability of 73%, so the classifiers barely outperform always choosing the class with the larger prior (assuming we trust the priors). Conditioning on the observed synapse density therefore adds little information, which is not surprising given the large overlap between the density distributions of the two classes. If we wanted to distinguish similar populations of Z layers with different priors in another dataset, our accuracy would decrease accordingly.
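As a sanity check on this baseline, the maximum prior can be scored directly with the same leave-one-out setup used above. This is a minimal sketch that reuses the features, y, p0, and p1 variables defined in earlier cells and assumes sklearn.dummy.DummyClassifier is available in this scikit-learn version; its output is not reproduced here.
In [ ]:
# Majority-class baseline: its leave-one-out accuracy should sit near max(p0, p1)
from sklearn.dummy import DummyClassifier
baseline = DummyClassifier(strategy='most_frequent')
loo = LeaveOneOut(len(features))
baseline_scores = cross_validation.cross_val_score(baseline, features, y, cv=loo)
print("Maximum prior probability:   %0.2f" % max(p0, p1))
print("Majority-class LOO accuracy: %0.2f (+/- %0.2f)" % (baseline_scores.mean(), ss.sem(baseline_scores)))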
In [32]:
# Save grid data
with open('grid_data_update.pickle', 'w') as f:
    pickle.dump([grid_means, grid_x, grid_y, grid_z], f)
In [ ]: